#!/usr/bin/env bash
# Environment setup for serving DeepSeek-R1 (int4 w4a8) with vLLM.
# Strict mode: abort on any failed command or unset variable instead of
# silently launching the server with a half-applied configuration.
set -euo pipefail
# Trace every command for debugging/provenance of the launch settings.
set -x

# Let the CUDA caching allocator grow segments instead of failing on
# fragmentation — helps long-context / high-memory-pressure serving.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Very large engine-iteration and RPC timeouts (10 h / ~10000 h) so slow,
# long generations are not killed by the default watchdogs.
export VLLM_ENGINE_ITERATION_TIMEOUT_S=36000
export VLLM_RPC_TIMEOUT=36000000
# NOTE(review): UMD_* look like GPU user-mode-driver tuning knobs
# (submit interval / direct access) — platform-specific; confirm on target HW.
export UMD_SUBMITINTERVAL=1
export UMD_KMTDIRECTACCESS=1

## w4a8 TN V2
# Keep MLA enabled and switch the w8a8 MoE kernels onto the w4a8 path.
export VLLM_MLA_DISABLE=0
export VLLM_W8A8_MOE_USE_W4A8=1

# Local path of the int4 TN-v2 quantized DeepSeek-R1 checkpoint.
model_id="/home/weights/DeepSeek-R1-int4-tnv2"

# Alternative GPU ordering, kept for reference:
#export  CUDA_VISIBLE_DEVICES=0,1,2,3,8,9,10,11,12,13,14,15,4,5,6,7

# Uneven layer split across the 4 pipeline stages.
export VLLM_PP_LAYER_PARTITION="18,16,15,12"
# Launch the OpenAI-compatible server: 4-way pipeline parallel x 4-way
# tensor parallel. Max context 51200 tokens, 6144-token batched-prefill
# budget, up to 16 concurrent sequences.
# Fixes vs. original: "$model_id" quoted (path expansion safety) and the
# deprecated $[ ... ] arithmetic replaced by standard $(( ... )); all flag
# names and values are unchanged.
vllm serve "$model_id" \
  --pipeline-parallel-size 4 \
  --tensor-parallel-size 4 \
  --trust-remote-code \
  --max-model-len $((1024 * 50)) \
  --max_num_batched_tokens $((1024 * 6)) \
  --gpu-memory-utilization 0.94 \
  --max_num_seqs 16 \
  --disable_log_requests
